knitr::opts_chunk$set(message=FALSE, warning=FALSE, eval=T, cache=F)

Intro

For today’s workshop, we’re going to use R to go through a typical bioinformatics analysis workflow. We’re going to use common bioinformatics techniques to visualize data and make beautiful figures.

The data we will analyze is breast cancer RNA-Seq data from TCGA, a popular publicly-available database for cancer-related datasets. The goal of the analysis will be to identify genes that show significant changes in expression between normal and tumor tissues, followed by identifying the pathways they are associated with. After importing the data and performing some data pre-processing, we will carry out differntial expression analysis and gene set enrichment analysis.

Main steps in today’s workshop:

  1. Import and pre-process RNA-Seq data
  2. Identify differentially-expressed genes between tumor and control samples
  3. Identify significantly-enriched pathways in the gene sets

Make sure to have the following packages installed for this workshop:

Working with Expression Set Objects

An expression set is a data object consisting of three entities: the expression matrix (exprs), the phenotye data (pData), and the feature data (fData).

We read in the RDS file included in this repo. It corresponds to a subset of samples from a gene expression dataset of breast cancer (BRCA) primary tissue samples from the TCGA project.

library(Biobase)
library(magrittr)
library(dplyr)
library(ggplot2)
library(Biobase)
library(ggfortify)
library(plotly)
brca <- readRDS("data/TCGA-BRCA.rds")

# dimensions of the expression data
dim(brca)
## Features  Samples 
##    36812     1222
# dimensions of the gene annotation
dim(fData(brca))
## [1] 36812     4
# first few rows of gene annotations
head(fData(brca)[,c("ensembl_transcript_id", "ensembl_gene_id", "hgnc_symbol")])
##          ensembl_transcript_id ensembl_gene_id hgnc_symbol
## TSPAN6      ENSG00000000003.13 ENSG00000000003      TSPAN6
## TNMD         ENSG00000000005.5 ENSG00000000005        TNMD
## DPM1        ENSG00000000419.11 ENSG00000000419        DPM1
## SCYL3       ENSG00000000457.12 ENSG00000000457       SCYL3
## C1orf112    ENSG00000000460.15 ENSG00000000460    C1orf112
## FGR         ENSG00000000938.11 ENSG00000000938         FGR
# dimensions of the phenotypic annotation
dim(pData(brca))
## [1] 1222   65
# first few rows of phenotype
head(pData(brca)[,c("patient_id", "sample_type", "tumor_subtype")])
##                                patient_id   sample_type tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 Primary Tumor          LumB
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY Primary Tumor          LumA
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z Primary Tumor          LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU Primary Tumor          LumA
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN Primary Tumor          Her2
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W Primary Tumor          LumB
# how many of each sample type?
table(pData(brca)$sample_type)
## 
##          Metastatic       Primary Tumor Solid Tissue Normal 
##                   7                1102                 113
# how many tumor subtypes?
table(pData(brca)$tumor_subtype)
## 
##  Basal   Her2   LumA   LumB Normal 
##    169    209    510    198     16

Log transform the data set

exprs(brca) <- log2(exprs(brca) + 1)
exprs(brca)[1:5,1:5]
##          TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A2-A0SY-01A-31R-A084-07
## TSPAN6                      13.975579                    10.981567
## TNMD                         1.584963                     6.189825
## DPM1                        11.156715                    10.822571
## SCYL3                       10.590587                    10.946906
## C1orf112                     9.519636                     9.339850
##          TCGA-AR-A24Z-01A-11R-A169-07 TCGA-D8-A1XU-01A-11R-A14M-07
## TSPAN6                      12.302353                    12.463013
## TNMD                         4.459432                     2.807355
## DPM1                        11.945444                    12.266494
## SCYL3                       10.611025                    11.149747
## C1orf112                     9.388017                     9.400879
##          TCGA-A1-A0SN-01A-11R-A144-07
## TSPAN6                       9.714246
## TNMD                         1.000000
## DPM1                        12.419960
## SCYL3                       11.136350
## C1orf112                     9.884171

PCA

Start by ranking genes based on their variation across samples

row.var <- sort(apply(exprs(brca), 1, var), decreasing=TRUE)
head(row.var)
##   CLEC3A  SCGB2A2     CPB1     TFF1  SCGB1D2    KCNJ3 
## 29.73892 25.49291 24.59669 21.00591 20.25785 19.56774

To save time, we’ll run PCA on the top 2500 most variable genes

df <- brca[names(row.var)[1:2500]] %>%
      exprs() %>%
      t() %>%
      data.frame()
    
pca <- prcomp(df)
pca.summary <- summary(pca)
pca.summary$importance[,1:5]
##                             PC1      PC2      PC3      PC4      PC5
## Standard deviation     47.42767 40.65232 29.32491 23.58570 20.28785
## Proportion of Variance  0.14934  0.10972  0.05709  0.03693  0.02733
## Cumulative Proportion   0.14934  0.25906  0.31615  0.35309  0.38041

2-D Plot

df$tumor_subtype <- brca$tumor_subtype
autoplot(pca, data=df, colour='tumor_subtype')

3-D Plot

df.pca <- cbind(pca$x[,c(1:3)], brca$tumor_subtype) %>%
          as.data.frame() %>%
          set_colnames(c("PC1", "PC2", "PC3", "tumor_subtype"))

head(df.pca)
##                                            PC1               PC2
## TCGA-A8-A085-01A-11R-A00Z-07 -89.9895594300379  14.4728151158205
## TCGA-A2-A0SY-01A-31R-A084-07  8.62013654000677 -39.5325958432916
## TCGA-AR-A24Z-01A-11R-A169-07 -41.8816734638114 -26.6304385642973
## TCGA-D8-A1XU-01A-11R-A14M-07 -22.8004029255938 -38.3033437021283
## TCGA-A1-A0SN-01A-11R-A144-07 -26.3108798601652  1.88579216258153
## TCGA-D8-A73W-01A-22R-A352-07 -46.7262990133077  6.82995550570086
##                                            PC3 tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 -38.4190093513565          LumB
## TCGA-A2-A0SY-01A-31R-A084-07  7.54869728087252          LumA
## TCGA-AR-A24Z-01A-11R-A169-07  11.3252895436753          LumB
## TCGA-D8-A1XU-01A-11R-A14M-07  24.3111755582573          LumA
## TCGA-A1-A0SN-01A-11R-A144-07    21.69949616053          Her2
## TCGA-D8-A73W-01A-22R-A352-07 -12.9151476451899          LumB
p <- plot_ly(df.pca,
             x = ~PC1,
             y = ~PC2,
             z = ~PC3,
             type="scatter3d",
             mode = "markers",
             color = ~tumor_subtype,
             marker = list(size = 3))

p

Challenge: Do it in 4-D